Using tidyr, dplyr, ggplot2, stringr, RCurl
UN Immigration Data
I will be using the data source from Subhalaxmi Rout regarding UN Migrartion Data.
This data set is huge, so I’m only looking at one table, which cooresponds to the year 1990. Once a pipline can be established, I can tidy a few more tables, join them and look at some trends. At first, here is a look at the data in a mostly raw form.
Load Data
m<-getURL(
"https://raw.githubusercontent.com/Shampjeff/cuny_msds/master/DATA_607/data/un_immg.csv")
df_immg<-read.csv(text=m, header = T, na.strings = "", skip = 14)Looking at the initial state of the data.
DT::datatable(df_immg[1:5,1:5],
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))I’ll need to rename columns so I am re-using a function I wrote previously to replace the column names with a value found in a given row.
rename.columns<-function(df, row_idx, col_start, name_join=F){
for (i in col_start:ncol(df)){
prefix<-c(str_remove(names(df[i]), "[.]\\d"), df[row_idx,i])
join_name<-paste(prefix, collapse="_")
if (name_join == FALSE){
names(df)[names(df) == names(df[i])]<-df[row_idx,i]
} else {
names(df)[names(df) == names(df[i])]<-join_name
}
}
return(df)
}Destination is where people ended up, country of origin is the total number of people who came to the destination column, the rest of the columns outline the numbers from each other country recognized by the UN. These are to columns I will tidy.
df_immg<-df_immg %>%
select(Major.area..region..country.or.area.of.destination,
Country.of.origin,
X.2:ncol(df_immg)) %>%
mutate_if(is.factor, as.character) %>% # change data types
rename( # replace unruly names
destination = Major.area..region..country.or.area.of.destination,
inbound_total = Country.of.origin) %>%
replace(is.na(.), 0) # replacing NA with zero
DT::datatable(df_immg[1:5,1:5],
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))Again, we need to rename the columns based on a row value.
df_immg<-rename.columns(df_immg,
row_idx = 1,
col_start = 3)
df_immg<-df_immg[2:length(df_immg), 1:234] # drop that last 10 rows (NAs)
df_immg$year = 1990
DT::datatable(df_immg[1:5,1:4],
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))This dataset is almost square, to make it tidy I think we want to pivot this to three columns: to, form, total moved. Once in a tidy format, we can remove whitespace from the numbers and cast them as numerics.
df_immg<-df_immg %>%
pivot_longer( # gather columns to rows
-c("destination",
"inbound_total",
"year"),
names_to = "origin", # naming params
values_to = "total",
names_repair = "unique") %>%
mutate(total = str_remove_all(total, " ")) %>%
mutate(inbound_total = str_remove_all(inbound_total, " ")) %>%
mutate_at(.vars=vars(total, inbound_total), .funs = as.numeric)
DT::datatable(df_immg,
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))57,536 rows with many repeat values and destinations. Is this really more tidy?
Migration Data Pipeline
Now that our process is in place, I’ll bring in and tidy several other years; 2000, 2005, 2010, and 2015.
urls<-c(
"https://raw.githubusercontent.com/Shampjeff/cuny_msds/master/DATA_607/data/un_immg_2k.csv",
"https://raw.githubusercontent.com/Shampjeff/cuny_msds/master/DATA_607/data/un_immg_2005.csv",
"https://raw.githubusercontent.com/Shampjeff/cuny_msds/master/DATA_607/data/un_immg_2010.csv",
"https://raw.githubusercontent.com/Shampjeff/cuny_msds/master/DATA_607/data/un_immg_2015.csv")
data_years<-c(2000,2005,2010,2015)
for (i in 1:length(data_years)){
d<-getURL(urls[i])
df_immg_i<-read.csv(text=d, header = T, na.strings = "", skip = 14)
df_immg_i<-df_immg_i %>%
select(Major.area..region..country.or.area.of.destination,
Country.of.origin,
X.2:ncol(df_immg_i)) %>%
mutate_if(is.factor, as.character) %>% # change data types
rename( # replace unruly names
destination = Major.area..region..country.or.area.of.destination,
inbound_total = Country.of.origin) %>%
replace(is.na(.), 0) # replacing NA with zero
df_immg_i<-rename.columns(df_immg_i,
row_idx = 1,
col_start = 3)[2:length(df_immg_i), 1:234]
df_immg_i<-df_immg_i %>%
pivot_longer( # gather columns to rows
-c("destination",
"inbound_total"),
names_to = "origin", # naming params
values_to = "total",
names_repair = "unique") %>%
mutate(total = str_remove_all(total, " ")) %>%
mutate(inbound_total = str_remove_all(inbound_total, " ")) %>%
mutate_at(.vars=vars(total, inbound_total), .funs = as.numeric) %>%
mutate(year = data_years[i])
df_immg<-rbind(df_immg, df_immg_i)
}